#### R Code for C. follicularis manuscript ####
### Cephalotus analysis and comparisons with Sarracenia and Nepenthes

### Load necessary packages
if (!require("ggplot2")) {install.packages("ggplot2"); require("ggplot2")}
if (!require("vegan")) {install.packages("vegan"); require("vegan")}
if (!require("picante")) {install.packages("picante"); require("picante")}
if (!require("dplyr")) {install.packages("dplyr"); require("dplyr")}
if (!require("lme4")) {install.packages("lme4"); require("lme4")}
if (!require("effects")) {install.packages("effects"); require("effects")}
if (!require("coin")) {install.packages("coin"); require("coin")}
if (!require("usedist")) {install.packages("usedist"); require("usedist")}
if (!require("metacoder")) {install.packages("metacoder"); require("metacoder")}
if (!require("tibble")) {install.packages("tibble"); require("tibble")}
if (!require("ggpubr")) {install.packages("ggpubr"); require("ggpubr")}
if (!require("DHARMa")) {install.packages("DHARMa"); require("DHARMa")}
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("phyloseq")
require("phyloseq")
# if (!requireNamespace("devtools", quietly = TRUE)){install.packages("devtools")}
# devtools::install_github("jbisanz/qiime2R")
require("qiime2R")



## Set to your working directory
setwd("YOUR_WORKING_DIRECTORY")


#### Read in data, metadata, taxonomy, and tree ####
asv16s <- read.csv("Cephalotus_Project_16S_ASV_table.csv",head=T,row.names=1) #read in ASV table
summary(rowSums(asv16s))
summary(colSums(asv16s)) 

### Metadata
md <- read.csv("Cephalotus_Project_Metadata.csv", head=T, row.names = 1, stringsAsFactors = T)
md <- md[order(row.names(md)),] # order samples alphabetically
md16s <- subset(md, row.names(md) %in% colnames(asv16s))
colnames(asv16s) == row.names(md16s) # sanity check

### Tree
asv16s.tree <- read.tree("sepp-tree-16S.nwk")
asv16s.tree <-root(asv16s.tree, "df3352436328528f16804b663387a2e1") ## root with an archaeon

### read in taxonomy
tax.16s <- read.csv("Cephalotus_Project_16S_taxonomy.csv", head=T, row.names = 1)
row.names(asv16s) == row.names(tax.16s) # sanity check

#### Clean up data ####
### remove chloroplasts
tax.16s2 <- tax.16s[grep("Chloroplast",tax.16s$Taxon, invert = T),]

asv16s <- subset(asv16s, row.names(asv16s) %in% row.names(tax.16s2))
sort(colSums(asv16s))

asv16s[asv16s < 10] <- 0 # each observation needs at least 10 seqs
asv16s <- asv16s[rowSums(asv16s) > 0,]
summary(rowSums(asv16s))
summary(colSums(asv16s)) 

asv16s <- asv16s[,colSums(asv16s) > 999] # each sample needs at least 1000 seqs

asv16s.t <- t(asv16s) # transpose rows and columns
asv16s.t <- asv16s.t[order(row.names(asv16s.t)),] # order samples alphabetically
asv16s.t <- asv16s.t[,order(colnames(asv16s.t))] # order asvs alphabetically

summary(rowSums(asv16s.t))
summary(colSums(asv16s.t)) 

md16s <- subset(md16s, row.names(md16s) %in% row.names(asv16s.t)) # One sample removed

##### Cephalotus Analysis #####
#### Plots of categories by site #####
md.cep <- subset(md, md$Type_2=="Cephalotus")
md.cep <- droplevels(md.cep)

#test for normality
shapiro.test(md.cep$pH) #p-value = 0.000125, not normal
shapiro.test(md.cep$DNA_conc) #p-value = 0.0004597, not normal
shapiro.test(md.cep$Volume) #p-value = 0.0008151, not normal

#visual inspection
ggqqplot(md.cep$pH)
ggqqplot(md.cep$DNA_conc)
ggqqplot(md.cep$Volume)

## Volume by Site
wilcox_test(Volume~Site, data = md.cep)

ggplot(data=md.cep, aes(x=Site, y=Volume)) +
  geom_violin(trim = F, draw_quantiles = 0.5) +
  ylab("Pitcher fluid volume (mLs)") +
  geom_dotplot(binaxis = "y", stackdir = "center", binwidth = 0.1, dotsize = .7) +
  theme_classic()

## pH by site
wilcox_test(pH~Site, data = md.cep)

## pH by fluid color
plot(pH ~ Fluid_color, data = md.cep)
kruskal.test(md.cep$pH, md.cep$Fluid_color)

## DNA concentration by site
wilcox_test(DNA_conc~Site, data = md.cep)

## Fluid color by site
plot(Fluid_color ~ Site, data = md.cep)
fisher.test(md.cep$Site, md.cep$Fluid_color) ## too few samples/category for chi-square test, so using Fisher's exact test here

## Pitcher fluid pH by volume
lm1 <- lm(pH ~ Volume, data=md.cep)
summary(lm1)

## Pitcher fluid pH by DNA concentration
plot(pH ~ DNA_conc, data=md.cep)
lm2 <- lm(pH ~ DNA_conc, data=md.cep)
summary(lm2)

## Pitcher volume by DNA concentration
lm3 <- lm(Volume ~ DNA_conc, data=md.cep)
summary(lm3)


#### PCA of the environmental variables
cep.env <- md.cep[,c(9:11)]
cep.pca <- rda(cep.env, scale = T)

# Calculate the percent of variance explained by first two axes
sum((as.vector(cep.pca$CA$eig)/sum(cep.pca$CA$eig))[1:2]) # 82%, this is good.

# In a biplot of a PCA, species' scores are drawn as arrows 
# that point in the direction of increasing values for that variable
biplot(cep.pca, choices = c(1,2), type = c("text", "text")) # biplot of axis 1 vs 2

#### Subset to Cephalotus 16S ####
cep16s <- subset(asv16s.t, md16s$Type_2=="Cephalotus")
cep16s <- cep16s[,colSums(cep16s) > 0] ## remove ASVs no longer present
sum(colSums(cep16s)) # 334,718 sequences across 36 samples; 709 ASVs
summary(rowSums(cep16s)) # Mean of 9,298 sequences per sample
summary(colSums(cep16s))

### Cephalotus 16S rarefaction curve, for Supplementary Figure
rarecurve(cep16s, step = 10, label = FALSE, main = "16S rarefaction")
abline(v = 4039, col = "red", lwd = 2)

cep16s <- rrarefy(cep16s,4039) ## rarefy at higher level
cep16s <- cep16s[,colSums(cep16s) > 0] ## remove ASVs no longer present

md16s.cep <- subset(md16s, row.names(md16s) %in% row.names(cep16s))
md16s.cep <- droplevels(md16s.cep)

row.names(cep16s) == row.names(md16s.cep)


#### 16S Cephalotus alpha diversity ####
cep16s.shannon <- diversity(cep16s)
cep16s.ef <- exp(cep16s.shannon)
summary(cep16s.ef)
md16s.cep <- cbind(md16s.cep, effective_species = cep16s.ef)
md16s.cep <- within(md16s.cep, Fluid_color <- relevel(Fluid_color, ref = 2))
md16s.cep$Fluid_color

### GLM with a gamma distribution
glm1.gamma <- glm(effective_species ~ pH + Volume + Site + Fluid_color + DNA_conc, family = Gamma(link="log"), data=md16s.cep)
summary(glm1.gamma)

plot(allEffects(glm1.gamma))

### Add in 18S effective species, need to first run 18S code
md16s.cep <- cbind(md16s.cep,es18s=md18s.cep.sub$effective_species)

##### GLM used in the paper 
glm2.gamma <- glm(effective_species ~ pH + Volume + Site + Fluid_color + DNA_conc + es18s, data=md16s.cep, family = Gamma(link="log"))

summary(glm2.gamma) ### Data for Table 1

plot(allEffects(glm2.gamma)) ## For Supplementary Figure

### Checking the assumptions for GLM with gamma
#goodness-of-fit of the model with a chi-square test based on the residual deviance and degrees of freedom
1 - pchisq(summary(glm2.gamma)$deviance, summary(glm2.gamma)$df.residual) 
#model does fit the data p>0.05

#Gamma model deviance
qchisq(0.95, df.residual(glm2.gamma)) #38.88514
deviance(glm2.gamma) #9.475929 =less than the 5% critical threshold, model fit is good

plot(glm2.gamma)

check_gamma_model <- simulateResiduals(fittedModel = glm2.gamma, n = 500)
plot(check_gamma_model)

###Assumptions for Gamma GLM are met:
# Heteroscedastic, residuals are not normally distributed
#The data  are independently distributed, cases are independent.
#The dependent variable (not normally distributed), fits a a gamma distribution to model (continuous, non-zero, non-negative, positively skewed data)
#A GLM does NOT assume a linear relationship between the response variable and the explanatory variables, but it does assume a linear relationship between the transformed expected response in terms of the link function (log link in this case)
#Errors need to be independent but NOT normally distributed.  Our residuals vs fitted plot shows no relationship between the two

## Plot for Fig 1D
ggplot(data=md16s.cep, aes(x=es18s, y=effective_species)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("Effective species of eukaryotes") +
  ylab("Effective species of bacteria") +
  theme_classic()

lmef <- lm(es18s ~ effective_species, data=md16s.cep)
summary(lmef)

## Plot for Fig 1C
ggplot(data=md16s.cep, aes(x=Site, y=effective_species)) +
  geom_violin(trim = F, draw_quantiles = 0.5) +
  ylab("Effective species of bacteria") +
  ylim(0,60) +
  geom_dotplot(binaxis = "y", stackdir = "center", binwidth = 1, dotsize = .8) +
  theme_classic()


### 16S Cephalotus NMDS ####
## Subset tree and taxonomy to relevant samples, put in format for phyloseq
cep16s.tree <- prune.sample(cep16s, asv16s.tree)
cep16s.tax <- subset(tax.16s2, row.names(tax.16s2) %in% colnames(cep16s))
cep16s.tax1 <- data.frame(Feature.ID=row.names(cep16s.tax),Taxon=cep16s.tax[,1])
cep16s.tax2 <- parse_taxonomy(cep16s.tax1)
cep16s.tax3 <- cbind(ASVs=row.names(cep16s.tax2),cep16s.tax2)

TAX.cep16s <- tax_table(as.matrix(cep16s.tax3))
OTU.cep16s = otu_table(as.matrix(cep16s), taxa_are_rows = FALSE)
SAM.cep16s = sample_data(md16s.cep)
cep16s.physeq <-merge_phyloseq(phyloseq(OTU.cep16s),SAM.cep16s,TAX.cep16s,cep16s.tree)



### Unweighted UniFrac
## Calculate unweighted Unifrac distance and run NMDS
cep16s.uu.dist <- distance(cep16s.physeq,"uUniFrac")
cep16s.uu.nmds <- metaMDS(cep16s.uu.dist, trymax=500)

## Plot for Fig 2A
plot(cep16s.uu.nmds$points[,1:2], xlab="NMDS Axis 1", ylab="NMDS Axis 2", 
     main="Cephalotus bacterial community by site", 
     col= c("red","blue")[md16s.cep$Site],
     pch=19)
legend("topleft", 
       legend=c("GRR","TPB"),
       col= c("red","blue"),
       pch=19,
       cex=0.8,
       bty = "n")
ordispider(cep16s.uu.nmds,groups = md16s.cep$Site, show.groups = "GRR", col = "red")
ordispider(cep16s.uu.nmds,groups = md16s.cep$Site, show.groups = "TPB", col = "blue")


ordi.16s.cep.vol <- ordisurf(cep16s.uu.nmds,md16s.cep$Volume, add=F, col="darkgray")
summary(ordi.16s.cep.vol)

ordiplot(cep16s.uu.nmds, type = "t",display = "sites",cex = .6) # Plot that shows names. Change cex to make names bigger or smaller

pH.f <- as.factor(md16s.cep$pH)
## Plot for Fig. 2C
plot(cep16s.uu.nmds$points[,1:2], xlab="NMDS Axis 1", ylab="NMDS Axis 2", 
     main="Cephalotus bacterial community by pH", 
     col= rainbow(9)[pH.f],
     pch=c(19,17)[md16s.cep$Site])
legend("topleft", 
       legend=c("3.5","3.75","4","4.25","4.5","4.75","5","6.5"),
       col= rainbow(9),
       pch=19,
       cex=0.6,
       bty = "n")
ordi.16s.cep.pH <- ordisurf(cep16s.uu.nmds,md16s.cep$pH, add=T, col="darkgray")
summary(ordi.16s.cep.pH)


#### 16s Cephalotus adonis and mantel tests ####
cep16s.ad <- adonis(cep16s ~ Fluid_color * Site, data=md16s.cep)
cep16s.ad

pH.dist <- vegdist(md16s.cep$pH,method="euclidean")
cep16s.pH.uu.man <- mantel(cep16s.uu.dist,pH.dist, method = "spearman", permutations=999)
cep16s.pH.uu.man

vol.dist <- vegdist(md16s.cep$Volume,method="euclidean")
cep16s.vol.uu.man <- mantel(cep16s.uu.dist,vol.dist, method = "spearman", permutations=999)
cep16s.vol.uu.man

DNAconc.dist <- vegdist(md16s.cep$DNA_conc,method="euclidean")
cep16s.dna.uu.man <- mantel(cep16s.uu.dist,DNAconc.dist, permutations=999)
cep16s.dna.uu.man

### mantel test with 18s, need to first run 18s analyses
cep18s.uu.dist.sub <- dist_subset(cep18s.uu.dist,rownames(md18s.cep.sub))
cep16s.18s.uu.man <- mantel(cep16s.uu.dist,cep18s.uu.dist.sub, permutations=999)
cep16s.18s.uu.man


######### 16s Barchart ######
cep16st <- t(cep16s)

#### Bring in taxonomy 
row.names(cep16st) == row.names(cep16s.tax2)

##### Class level with top 8
cep16stc <- data.frame(Class=cep16s.tax2$Class,cep16st)
cep16stc$Class[is.na(cep16stc$Class)] <- "Unknown"
cep16stca <- aggregate(. ~ cep16stc$Class, cep16stc[,2:37], sum) 
row.names(cep16stca) <- cep16stca[,1]
cep16stca <- cep16stca[,2:37]

cep16stcao <- as.matrix(cep16stca[order(rowSums(cep16stca),decreasing = T),])

cep16stcaop <- cep16stcao[1:7,]
other <- colSums(cep16stcao[8:25,])
cep16stcaopo <- rbind(cep16stcaop,other)

safe_colorblind_palette8 <- c("#332288", "#CC6677", "#DDCC77","#44AA99", "#AA4499", "#6699CC", 
                              "#661100", "#888888")
## Figure 1B
barplot(cep16stcaopo,col=safe_colorblind_palette8,legend.text=T,axes=F,cex.names=.8,las=2,space=0,args.legend = list(x = "topleft", bty = "n", cex=0.6, inset=c(-0.1, -0.1)))


#### 16S Metacoder taxonomy heat trees ####
## Changed from tax2 to tax to parse taxa in a diff way to limit NA
cep16st.tax <- cbind(cep16s.tax,cep16st)
md16s.cep.named <- rownames_to_column(md16s.cep, var = "sample_id")
dim(cep16st) #[1] 707   36
dim(cep16st.tax) #[1] 707   38
cep16st.tax <- subset(cep16st.tax, select = -Confidence)
cep16st.tax <- subset(cep16st.tax, Taxon != "Unassigned")

## the taxonomy section is hard to handle because of formatting, we can use the `taxa` package to parse this and process the abundance data too
obj <- parse_tax_data(cep16st.tax,
                      class_cols = "Taxon", # the column that contains taxonomic information
                      class_sep = ";", # The character used to separate taxa in the classification
                      class_regex = "^(.+)_(.+)_(.+)$",
                      class_key = c(tax_rank1 = "info", # this is the key that labels each column pulled from the regex, since we had three sections to each identifier we need three columns
                                    tax_rank2 = "info2",
                                    tax_name = "taxon_name"))
#Calculate proportions
obj$data$tax_data <- metacoder::calc_obs_props(obj, "tax_data",
                                               cols = md16s.cep.named$sample_id)

# now we need to calculate abundances based on taxon not ASV
obj$data$tax_abund <- metacoder::calc_taxon_abund(obj, "tax_data",
                                                  cols = md16s.cep.named$sample_id)

#16S heatmap matrix by fluid color
obj$data$diff_table <- metacoder::compare_groups(obj, data = "tax_abund",
                                                 cols = md16s.cep.named$sample_id, # What columns of sample data to use
                                                 groups = md16s.cep.named$Fluid_color) # What category each sample is assigned to


obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method = "fdr")

#lets look at the p-values and see if there is any significance
range(obj$data$diff_table$wilcox_p_value, finite = TRUE)
# [1] 0.01563288 1.00000000
#the lower range is significant

## Focus only on significant taxa
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0


set.seed(1)
#this code takes 3 min on my computer
diff_heattree_color <- metacoder::heat_tree_matrix(obj, data = "diff_table",
                                                   node_size = n_obs, 
                                                   node_label = taxon_names,
                                                   node_color = log2_median_ratio,
                                                   node_color_range = diverging_palette(),
                                                   node_color_trans = "linear", 
                                                   node_color_interval = c(-3, 3), 
                                                   edge_color_interval = c(-3, 3), 
                                                   node_size_axis_label = "Number of ASVs",
                                                   node_color_axis_label = "Log2 ratio median proportions",
                                                   layout = "davidson-harel", 
                                                   initial_layout = "reingold-tilford")
## This plot takes a while to load
print(diff_heattree_color) ## Show taxonomic heat tree 

#look at site for 16S
# Compare many groups in a matrix of heat trees, here I am comparing taxa abundance by Site
obj$data$diff_table2 <- metacoder::compare_groups(obj, data = "tax_abund",
                                                  cols = md16s.cep.named$sample_id, # What columns of sample data to use
                                                  groups = md16s.cep.named$Site) # What category each sample is assigned to


obj$data$diff_table2$wilcox_p_value <- p.adjust(obj$data$diff_table2$wilcox_p_value, method = "fdr")

#lets look at the p-values and see if there is any significance
range(obj$data$diff_table2$wilcox_p_value, finite = TRUE)
# [1] 0.0721033 1.00000000
#not sig diff so didn't make a heat tree for site


### Comparison of C. follicularis and N. mirabilis 16S communities in Australia ####
md16s.aus <- subset(md16s, md16s$Location %in% c("NE_Australia","W_Australia"))
md16s.aus <- droplevels(md16s.aus)

rownames(asv16s.t) == rownames(md16s) ## double check

aus16s <- subset(asv16s.t, md16s$Location %in% c("NE_Australia","W_Australia"))
aus16s <- aus16s[,colSums(aus16s) > 0] ## remove ASVs no longer present
summary(rowSums(aus16s))
summary(colSums(aus16s))
aus16s <- rrarefy(aus16s,1306) ## rarefy at 1306
aus16s <- aus16s[,colSums(aus16s) > 0] ## remove ASVs no longer present

row.names(aus16s) == row.names(md16s.aus) ## double check

######### 16s Barchart - Australia ######
aus16st <- t(aus16s)

aus16s.tax <- subset(tax.16s2, row.names(tax.16s2) %in% colnames(aus16s))
aus16s.tax1 <- data.frame(Feature.ID=row.names(aus16s.tax),Taxon=aus16s.tax[,1])
aus16s.tax2 <- parse_taxonomy(aus16s.tax1)

#### Bring in taxonomy 
row.names(aus16st) == row.names(aus16s.tax2)

##### Class level with top 8
aus16stc <- data.frame(Class=aus16s.tax2$Class,aus16st)
aus16stc$Class[is.na(aus16stc$Class)] <- "Unknown"
aus16stca <- aggregate(. ~ aus16stc$Class, aus16stc[,2:57], sum) 
row.names(aus16stca) <- aus16stca[,1]
aus16stca <- aus16stca[,2:57]

aus16stcao <- as.matrix(aus16stca[order(rowSums(aus16stca),decreasing = T),])

aus16stcaop <- aus16stcao[1:7,]
other <- colSums(aus16stcao[8:24,])
aus16stcaopo <- rbind(aus16stcaop,other)

safe_colorblind_palette8 <- c("#332288", "#CC6677", "#DDCC77", "#44AA99", "#AA4499", "#6699CC", 
                              "#661100", "#888888")

barplot(aus16stcaopo,col=safe_colorblind_palette8,legend.text=T,axes=F,cex.names=.8,las=2,space=0,args.legend = list(x = "topleft", bty = "n", cex=0.6, inset=c(-0.1, -0.1)))



### Top 16S Cephalotus ASVs, with taxonomy #####
hist(colSums(cep16s))
hist(log(colSums(cep16s)))
cep16s.top <- data.frame(sort(colSums(cep16s),decreasing = T))
names(cep16s.top) <- "Sequences"
cep16s.top.tax <- subset(tax.16s2, row.names(tax.16s2) %in% row.names(cep16s.top))
cep16s.top.tax <- cep16s.top.tax[match(rownames(cep16s.top), rownames(cep16s.top.tax)), ]
row.names(cep16s.top.tax) == row.names(cep16s.top)
cep16s.top <- data.frame(cep16s.top,cep16s.top.tax)
# write.csv(cep16s.top,"Cephalotus_16S_ASVs_by_abundance.csv", quote = F, row.names = T)

####look at top 20 asv abundance taxonomy in 16S ceph
sum(cep16s.top$Sequences) #145404 16S reads post clean up
cep16s.top.20 <- cep16s.top[1:20,]
sum(cep16s.top.20$Sequences) #71235 16S reads in top 20

#top 20 asv reads divided by total reads
(71235/145404)*100 #48.99109

cep16s.top.20 <- tibble::rownames_to_column(cep16s.top.20, "Feature.ID")
cep16s.top.20.tax <- parse_taxonomy(cep16s.top.20)
cep16s.top.20.tax.abund <- cbind(cep16s.top.20.tax, cep16s.top.20)

#look at top 16S families
ceph.16S.burk <- subset(cep16s.top.20.tax.abund, Family == "Burkholderiaceae")
((sum(ceph.16S.burk$Sequences)/71235)*100) #39.671514%

ceph.16S.rhod <- subset(cep16s.top.20.tax.abund, Family == "Rhodanobacteraceae")
((sum(ceph.16S.rhod$Sequences)/71235)*100) #27.989055%

## Top taxa by site
cep16s.grr <- subset(cep16s, md16s.cep$Site %in% "GRR")
cep16s.grr.top <- data.frame(sort(colSums(cep16s.grr),decreasing = T))
names(cep16s.grr.top) <- "Sequences"
cep16s.grr.top.tax <- subset(tax.16s2, row.names(tax.16s2) %in% row.names(cep16s.grr.top))
cep16s.grr.top.tax <- cep16s.grr.top.tax[match(rownames(cep16s.grr.top), rownames(cep16s.grr.top.tax)), ]
row.names(cep16s.grr.top.tax) == row.names(cep16s.grr.top)
cep16s.grr.top <- data.frame(cep16s.grr.top,cep16s.grr.top.tax)

cep16s.tpb <- subset(cep16s, md16s.cep$Site %in% "TPB")
cep16s.tpb.top <- data.frame(sort(colSums(cep16s.tpb),decreasing = T))
names(cep16s.tpb.top) <- "Sequences"
cep16s.tpb.top.tax <- subset(tax.16s2, row.names(tax.16s2) %in% row.names(cep16s.tpb.top))
cep16s.tpb.top.tax <- cep16s.tpb.top.tax[match(rownames(cep16s.tpb.top), rownames(cep16s.tpb.top.tax)), ]
row.names(cep16s.tpb.top.tax) == row.names(cep16s.tpb.top)
cep16s.tpb.top <- data.frame(cep16s.tpb.top,cep16s.tpb.top.tax)

### Top bacterial ASV is the same in both sites, in both the second top ASV is a Rhodanobacter (but different ones!)


#### All 16S Comparison ####
summary(rowSums(asv16s.t))
asv16s.tr <- rrarefy(asv16s.t,1306)
asv16s.tr <- asv16s.tr[,colSums(asv16s.tr) > 0]

rownames(asv16s.tr) == rownames(md16s) ## double check

### Make new metadata that separates out Australian Nepenthes
md16s.mir <- subset(md16s, md16s$Project == "Mirabilis")
levels(md16s.mir$Type_2) <- c("Bog","Cephalotus","Leaf","Aus_Nepenthes","Sarracenia","Soil","Tube")
md16s2 <- md16s
md16s2$Type_2 <- as.character(md16s2$Type_2)
md16s.mir$Type_2 <- as.character(md16s.mir$Type_2)
md16s2[267:287,] <- md16s.mir
md16s2$Type_2 <- as.factor(md16s2$Type_2)

#### All 16S alpha diversity comparison ####
asv16s.shannon <- diversity(asv16s.tr)
asv16s.ef <- exp(asv16s.shannon)
summary(asv16s.ef)
md16s2$effective_species <- asv16s.ef

md16s2$Type_2 <- factor(md16s2$Type_2, levels = c("Cephalotus","Aus_Nepenthes", "Nepenthes", "Sarracenia", "Tube", "Leaf", "Bog", "Soil"))

ggplot(data=md16s2, aes(x=Type_2, y=effective_species, col=Type_2)) +
  geom_violin(trim = T, draw_quantiles = 0.5) +
  ylab("Effective species of bacteria") +
  geom_dotplot(binaxis = "y", stackdir = "center", binwidth = 1, dotsize = .8) +
  theme_classic() + 
  scale_colour_manual(values = c("#0b9e03","#EE0000","#EE0000","#0080FF","#663300","#663300","#663300","#663300")) +
  theme(legend.position="none")

asv16s.kw <- kruskal.test(effective_species~Type_2, data = md16s2)
asv16s.kw

asv16s.pw <- pairwise.wilcox.test(md16s2$effective_species, md16s2$Type_2, p.adjust.method = 'BH')
asv16s.pw

#### All 16S iToL Preparation ####
##### Remove ASVs from each genus that are in less than 10% of the samples from each genus for iToL
asv16s.t100 <- asv16s.t[,colSums(asv16s.t) > 99]
summary(rowSums(asv16s.t100))
summary(colSums(asv16s.t100))
sort(rowSums(asv16s.t100))
asv16s.t100r <- rrarefy(asv16s.t100, 900)
asv16s.t100r <- asv16s.t100r[,colSums(asv16s.t100r) > 0]
summary(colSums(asv16s.t100r))

summary(md16s$Type_2)

Nep16 <- subset(asv16s.t100r, md16s$Type_2=="Nepenthes")
Sar16 <- subset(asv16s.t100r, md16s$Type_2=="Sarracenia")
Cep16 <- subset(asv16s.t100r, md16s$Type_2=="Cephalotus")

Cep16.t <- t(Cep16)
non0 <- apply(Cep16.t, 1, function(x) sum(x != 0))
Cep16.t[which(non0 <=4),] <- 0
Cep16.t.normln <- data.frame(log(rowSums(Cep16.t)/36 + 1))

Nep16.t <- t(Nep16)
non0 <- apply(Nep16.t, 1, function(x) sum(x != 0))
Nep16.t[which(non0 <=23),] <- 0
Nep16.t.normln <- data.frame(log(rowSums(Nep16.t)/231 + 1))

Sar16.t <- t(Sar16)
non0 <- apply(Sar16.t, 1, function(x) sum(x != 0))
Sar16.t[which(non0 <=14),] <- 0
Sar16.t.normln <- data.frame(log(rowSums(Sar16.t)/143 + 1))

BLS16 <- subset(asv16s.t100r, md16s$Type_2 %in% c("Bog","Leaf","Soil"))
BLS16.t <- t(BLS16)
# non0 <- apply(BLS16.t, 1, function(x) sum(x != 0))
# BLS16.t[which(non0 <=7),] <- 0
BLS16.t.normln <- data.frame(log(rowSums(BLS16.t)/65 + 1))

iToL16.table <- cbind(Cep16.t.normln,Nep16.t.normln,Sar16.t.normln,BLS16.t.normln)
iToL16.table <- iToL16.table[rowSums(iToL16.table) > 0,]
names(iToL16.table) <- c("Cephalotus","Nepenthes","Sarracenia","BogLeafSoil")

# write.csv(iToL16.table, "Cephalotus_comparison_iToL_16S_2-3_ASV100r.csv", quote = F, row.names = T)

#### Need to subset tree to the same ASVs
asv16s.tree$tip.label
iToL16.matrix <- as.matrix(t(iToL16.table))
asv16s.tree.iToL <- prune.sample(iToL16.matrix, asv16s.tree)
# write.tree(asv16s.tree.iToL,"sepp-tree-16S-iToL_2-3_ASV100r.nwk")

### Subset taxonomy too
iTol.16s.tax <- subset(tax.16s2, row.names(tax.16s2) %in% colnames(iToL16.matrix))
# write.csv(iTol.16s.tax, "Taxonomy_16S_iToL_2-3_ASV100r.csv", quote = F, row.names = T)


##### 18S Analysis #####
#### Read in 18S data, taxonomy, and tree ####
asv18s <- read.csv("Cephalotus_Project_18s_ASV_table.csv",head=T,row.names=1) #read in ASV table
summary(rowSums(asv18s))
summary(colSums(asv18s)) 

### Metadata
md18s <- subset(md, row.names(md) %in% colnames(asv18s))
colnames(asv18s) == row.names(md18s) # sanity check

### Tree
asv18s.tree <- read.tree("sepp-tree-18s.nwk")
asv18s.tree <- root(asv18s.tree, "02a14a1577b468489fed1e4f99b35dae") ## Root with an archaeon

### read in taxonomy
tax.18s <- read.csv("Cephalotus_Project_18s_taxonomy.csv", head=T, row.names = 1)
tax.18s <- subset(tax.18s, row.names(tax.18s) %in% row.names(asv18s))
row.names(asv18s) == row.names(tax.18s) # sanity check

#### Clean up data ####
### remove bacteria and archaea
tax.18s2 <- tax.18s[grep("Bacteria",tax.18s$Taxon, invert = T),]
tax.18s2 <- tax.18s2[grep("Archaea",tax.18s2$Taxon, invert = T),]

### Remove ASV that blasts to a bacterium
tax.18s2 <- tax.18s2[grep("d3dc4704e54fa849d21c42b883988e8d", row.names(tax.18s2), invert = T),]

asv18s <- subset(asv18s, row.names(asv18s) %in% row.names(tax.18s2))
sort(colSums(asv18s))

asv18s[asv18s < 10] <- 0 # each observation needs at least 10 seqs
asv18s <- asv18s[rowSums(asv18s) > 0,]
summary(rowSums(asv18s))
summary(colSums(asv18s)) 

asv18s <- asv18s[,colSums(asv18s) > 999] # each sample needs at least 1000 seqs

asv18s.t <- t(asv18s) # transpose rows and columns
asv18s.t <- asv18s.t[order(row.names(asv18s.t)),] # order samples alphabetically
asv18s.t <- asv18s.t[,order(colnames(asv18s.t))] # order asvs alphabetically

summary(rowSums(asv18s.t))
summary(colSums(asv18s.t)) 

md18s <- subset(md18s, row.names(md18s) %in% row.names(asv18s.t)) # One sample removed

##### 18S Cephalotus Analysis #####
#### Subset to Cephalotus
cep18s <- subset(asv18s.t, md18s$Type_2=="Cephalotus")
cep18s <- cep18s[,colSums(cep18s) > 0] ## remove ASVs no longer present
sum(colSums(cep18s)) # 668,485 sequences across 40 samples
summary(rowSums(cep18s)) # Mean of 16,712 seqs per sample
summary(colSums(cep18s))

### Cephalotus 18S rarefaction curve, for Supplementary Figure
rarecurve(cep18s, step = 10, label = FALSE, main = "18S rarefaction")
abline(v = 2704, col = "red", lwd = 2)

cep18s <- rrarefy(cep18s,2704) ## rarefy at higher level
cep18s <- cep18s[,colSums(cep18s) > 0] ## remove ASVs no longer present

md18s.cep <- subset(md18s, row.names(md18s) %in% row.names(cep18s))
md18s.cep <- droplevels(md18s.cep)

row.names(cep18s) == row.names(md18s.cep)

#### 18s Cephalotus alpha diversity ####
cep18s.shannon <- diversity(cep18s)
cep18s.ef <- exp(cep18s.shannon)
summary(cep18s.ef)
md18s.cep <- cbind(md18s.cep, effective_species = cep18s.ef)
md18s.cep <- within(md18s.cep, Fluid_color <- relevel(Fluid_color, ref = 2))
md18s.cep$Fluid_color

### glm18s. with a gamma distribution
glm18s.1.gamma <- glm(effective_species ~ pH + Volume + Site + Fluid_color + DNA_conc, family = Gamma(link="log"), data=md18s.cep)
summary(glm18s.1.gamma)

plot(allEffects(glm18s.1.gamma))

### Subset to match samples and add in 16S effective species
md18s.cep.sub <- subset(md18s.cep, row.names(md18s.cep) %in% row.names(md16s.cep))
row.names(md18s.cep.sub) == row.names(md16s.cep)
md18s.cep.sub <- cbind(md18s.cep.sub,es16s=md16s.cep$effective_species)


##### GLM used in the paper 
glm18s.2.gamma <- glm(effective_species ~ pH + Volume + Site + Fluid_color + DNA_conc + es16s, data=md18s.cep.sub, family = Gamma(link="log"))

summary(glm18s.2.gamma) ### Data for Table 1

plot(allEffects(glm18s.2.gamma)) ## For Supplementary Figure

### Checking the assumptions for 18S GLM with gamma
#goodness-of-fit of the model with a chi-square test based on the residual deviance and degrees of freedom
1 - pchisq(summary(glm18s.2.gamma)$deviance, summary(glm18s.2.gamma)$df.residual) 
#model does fit the data p>0.05

#Gamma model deviance
qchisq(0.95, df.residual(glm18s.2.gamma)) #38.88514
deviance(glm18s.2.gamma) #9.231094 =less than the 5% critical threshold, model fit is good

plot(glm18s.2.gamma)

check_gamma_model <- simulateResiduals(fittedModel = glm18s.2.gamma, n = 500)
plot(check_gamma_model)

###Assumptions for 18S Gamma GLM are met:
# Heteroscedastic, residuals are not normally distributed
#The data  are independently distributed, cases are independent.
#The dependent variable (not normally distributed), fits a a gamma distribution to model (continuous, non-zero, non-negative, positively skewed data)
#A GLM does NOT assume a linear relationship between the response variable and the explanatory variables, but it does assume a linear relationship between the transformed expected response in terms of the link function (log link in this case)
#Errors need to be independent but NOT normally distributed.  Our residuals vs fitted plot shows no relationship between the two


## Plot for Fig. 1C
ggplot(data=md18s.cep, aes(x=Site, y=effective_species)) +
  geom_violin(trim = F, draw_quantiles = 0.5) +
  ylab("Effective species of eukaryotes") +
  ylim(0,60) +
  geom_dotplot(binaxis = "y", stackdir = "center", binwidth = 0.8, dotsize = 1) +
  theme_classic()


### 18s Cephalotus NMDS ####

## Subset tree to relevant samples, put in format for phyloseq
cep18s.tree <- prune.sample(cep18s, asv18s.tree)
cep18s.tax <- subset(tax.18s2, row.names(tax.18s2) %in% colnames(cep18s))
cep18s.tax1 <- data.frame(Feature.ID=row.names(cep18s.tax),Taxon=cep18s.tax[,1])
cep18s.tax2 <- parse_taxonomy(cep18s.tax1)
cep18s.tax3 <- cbind(ASVs=row.names(cep18s.tax2),cep18s.tax2)

TAX.cep18s <- tax_table(as.matrix(cep18s.tax3))
OTU.cep18s = otu_table(as.matrix(cep18s), taxa_are_rows = FALSE)
SAM.cep18s = sample_data(md18s.cep)
cep18s.physeq <-merge_phyloseq(phyloseq(OTU.cep18s),SAM.cep18s,TAX.cep18s,cep18s.tree)

## Calculate unweighted Unifrac distance and run NMDS
cep18s.uu.dist <- distance(cep18s.physeq,"uUniFrac")
cep18s.uu.nmds <- metaMDS(cep18s.uu.dist, trymax=1000)

## Plot for Fig. 2B
plot(cep18s.uu.nmds$points[,1:2], xlab="NMDS Axis 1", ylab="NMDS Axis 2", 
     main="Cephalotus eukaryotic community by site", 
     col= c("red","blue")[md18s.cep$Site],
     pch=19)
legend("topleft", 
       legend=c("GRR","TPB"),
       col= c("red","blue"),
       pch=19,
       cex=0.8,
       bty = "n")
ordispider(cep18s.uu.nmds,groups = md18s.cep$Site, show.groups = "GRR", col = "red")
ordispider(cep18s.uu.nmds,groups = md18s.cep$Site, show.groups = "TPB", col = "blue")


ordi.18s.cep.dnaconc <- ordisurf(cep18s.uu.nmds,md18s.cep$DNA_conc, add=F, col="darkgray")
summary(ordi.18s.cep.dnaconc)

ordiplot(cep18s.uu.nmds, type = "t",display = "sites",cex = .6) # Plot that shows names. Change cex to make names bigger or smaller

pH18s.f <- as.factor(md18s.cep$pH)
## Plot for Fig. 2D
plot(cep18s.uu.nmds$points[,1:2], xlab="NMDS Axis 1", ylab="NMDS Axis 2", 
     main="Cephalotus eukaryotic community by pH", 
     col= rainbow(9)[pH18s.f],
     pch=c(19,17)[md18s.cep$Site])
legend("topleft", 
       legend=c("3.5","3.75","4","4.25","4.5","4.75","5","6.5","7"),
       col= rainbow(9),
       pch=19,
       cex=0.6,
       bty = "n")
ordi.18s.cep.pH <- ordisurf(cep18s.uu.nmds,md18s.cep$pH, add=T, col="darkgray")
summary(ordi.18s.cep.pH)


#### 18S Cephalotus adonis and mantel tests ####
cep18s.ad <- adonis(cep18s ~ Fluid_color * Site, data=md18s.cep)
cep18s.ad

pH18s.dist <- vegdist(md18s.cep$pH,method="euclidean")
cep18s.pH.uu.man <- mantel(cep18s.uu.dist,pH18s.dist, method = "spearman", permutations=999)
cep18s.pH.uu.man

vol18s.dist <- vegdist(md18s.cep$Volume,method="euclidean")
cep18s.vol.uu.man <- mantel(cep18s.uu.dist,vol18s.dist, method = "spearman", permutations=999)
cep18s.vol.uu.man

DNAconc18s.dist <- vegdist(md18s.cep$DNA_conc,method="euclidean")
cep18s.dna.uu.man <- mantel(cep18s.uu.dist,DNAconc18s.dist, permutations=999)
cep18s.dna.uu.man

######### 18s Barchart ######
cep18st <- t(cep18s)

#### Check taxonomy
row.names(cep18st) == row.names(cep18s.tax2)

##### Order level with top 8 
cep18sto <- data.frame(Order=cep18s.tax2$Order,cep18st)
cep18sto$Order[is.na(cep18sto$Order)] <- "Unlisted"
cep18stoa <- aggregate(. ~ cep18sto$Order, cep18sto[,2:40], sum) 
row.names(cep18stoa) <- cep18stoa[,1]
cep18stoa <- cep18stoa[,2:40]
cep18stoao <- as.matrix(cep18stoa[order(rowSums(cep18stoa),decreasing = T),])
cep18stoaop <- cep18stoao[1:7,]
other <- colSums(cep18stoao[8:16,])
cep18stoaopo <- rbind(cep18stoaop,other)

customcol8 <- c("darkred","tomato1","palegoldenrod","cadetblue4","cyan","darkblue","purple","gray")

### Plot for Figure 1B
barplot(cep18stoaopo,col=customcol8,legend.text=T,axes=F,cex.names=.8,las=2,space=0,args.legend = list(x = "topleft", bty = "n", cex=0.6, inset=c(-0.1, -0.1)))


#### 18S Metacoder heat tree  #####
## Same analyses as for 16S

cep18st.tax <- cbind(cep18s.tax1,cep18st)
md18s.cep.named <- rownames_to_column(md18s.cep, var = "sample_id")

cep18st.tax <- subset(cep18st.tax, Taxon != "Unassigned")

# the taxonomy section is hard to handle because of formatting, we can use the `taxa` package to parse this and process the abundance data too
cep18.obj <- parse_tax_data(cep18st.tax,
                            class_cols = "Taxon", # the column that contains taxonomic information
                            class_sep = ";", # The character used to separate taxa in the classification
                            class_regex = "^(.+)_(.+)_(.+)$",
                            class_key = c(tax_rank1 = "info", # this is the key that labels each column pulled from the regex, since we had three sections to each identifier we need three columns
                                          tax_rank2 = "info2",
                                          tax_name = "taxon_name"))


#This data is rarefied but I still did proportions.
cep18.obj$data$tax_data <- metacoder::calc_obs_props(cep18.obj, "tax_data",
                                                     cols = md18s.cep.named$sample_id)

# now we need to calculate abundances based on taxon not ASV
cep18.obj$data$tax_abund <- metacoder::calc_taxon_abund(cep18.obj, "tax_data",
                                                        cols = md18s.cep.named$sample_id)

#Heat matrix between fluid color
# Compare many groups in a matrix of heat trees, here I am comparing taxa abundance by fluid color
cep18.obj$data$diff_table <- metacoder::compare_groups(cep18.obj, data = "tax_abund",
                                                       cols = md18s.cep.named$sample_id, # What columns of sample data to use
                                                       groups = md18s.cep.named$Fluid_color) # What category each sample is assigned to


cep18.obj$data$diff_table$wilcox_p_value <- p.adjust(cep18.obj$data$diff_table$wilcox_p_value, method = "fdr")

#lets look at the p-values and see if there is any significance
range(cep18.obj$data$diff_table$wilcox_p_value, finite = TRUE)
# [1] 0.03216378 1.00000000 
#the lower range is significant

## Focus only on significant taxa
cep18.obj$data$diff_table$log2_median_ratio[cep18.obj$data$diff_table$wilcox_p_value > 0.05] <- 0

set.seed(1)
diff_heattree_color_18S <- metacoder::heat_tree_matrix(cep18.obj, data = "diff_table",
                                                       node_size = n_obs, 
                                                       node_label = taxon_names,
                                                       node_color = log2_median_ratio,
                                                       node_color_range = diverging_palette(),
                                                       node_color_trans = "linear", 
                                                       node_color_interval = c(-3, 3), 
                                                       edge_color_interval = c(-3, 3), 
                                                       node_size_axis_label = "Number of ASVs",
                                                       node_color_axis_label = "Log2 ratio median proportions",
                                                       layout = "davidson-harel", 
                                                       initial_layout = "reingold-tilford")
print(diff_heattree_color_18S) ## Show taxonomic heat tree 

## 18S site
# Compare many groups in a matrix of heat trees, here I am comparing taxa abundance by site
cep18.site <- parse_tax_data(cep18st.tax,
                             class_cols = "Taxon", # the column that contains taxonomic information
                             class_sep = ";", # The character used to separate taxa in the classification
                             class_regex = "^(.+)_(.+)_(.+)$",
                             class_key = c(tax_rank1 = "info", # this is the key that labels each column pulled from the regex, since we had three sections to each identifier we need three columns
                                           tax_rank2 = "info2",
                                           tax_name = "taxon_name"))


cep18.site$data$tax_data <- metacoder::calc_obs_props(cep18.site, "tax_data",
                                                      cols = md18s.cep.named$sample_id)

# now we need to calculate abundances based on taxon not ASV
cep18.site$data$tax_abund <- metacoder::calc_taxon_abund(cep18.site, "tax_data",
                                                         cols = md18s.cep.named$sample_id)

cep18.site$data$diff_table <- metacoder::compare_groups(cep18.site, data = "tax_abund",
                                                        cols = md18s.cep.named$sample_id, # What columns of sample data to use
                                                        groups = md18s.cep.named$Site) # What category each sample is assigned to

cep18.site$data$diff_table$wilcox_p_value <- p.adjust(cep18.site$data$diff_table$wilcox_p_value, method = "fdr")

#lets look at the p-values and see if there is any significance
range(cep18.site$data$diff_table$wilcox_p_value, finite = TRUE)
# [1] 0.04379435 1.00000000 
#the lower range is significant

## Focus only on significant taxa
cep18.site$data$diff_table$log2_median_ratio[cep18.site$data$diff_table$wilcox_p_value > 0.05] <- 0

set.seed(1)
diff_heattree_site_18S <- heat_tree(cep18.site, node_label = taxon_names,
                                    node_size = n_obs, # n_obs is a function that calculates, in this case, the number of OTUs per taxon
                                    node_color = log2_median_ratio, # A column from `cep18.obj.site$data$diff_table`
                                    node_color_interval = c(-3, 3), # The range of `log2_median_ratio` to display
                                    node_color_range = diverging_palette(), # The color palette used
                                    node_size_axis_label = "Number of ASVs",
                                    node_color_axis_label = "Log 2 ratio of median proportions",
                                    layout = "davidson-harel", # The primary layout algorithm
                                    initial_layout = "reingold-tilford") # The layout algorithm that initializes node locations

print(diff_heattree_site_18S) ## Show taxonomic heat tree 


### Comparison of C. follicularis and N. mirabilis 18s communities in Australia ####
md18s.aus <- subset(md18s, md18s$Location %in% c("NE_Australia","W_Australia"))
md18s.aus <- droplevels(md18s.aus)

rownames(asv18s.t) == rownames(md18s) ## double check

aus18s <- subset(asv18s.t, md18s$Location %in% c("NE_Australia","W_Australia"))
aus18s <- aus18s[,colSums(aus18s) > 0] ## remove ASVs no longer present
summary(rowSums(aus18s))
summary(colSums(aus18s))
aus18s <- rrarefy(aus18s,1556) ## rarefy at 1556
aus18s <- aus18s[,colSums(aus18s) > 0] ## remove ASVs no longer present

row.names(aus18s) == row.names(md18s.aus) ## double check


######### 18s Barchart - Australia ######
aus18st <- t(aus18s)

aus18s.tax <- subset(tax.18s2, row.names(tax.18s2) %in% colnames(aus18s))
aus18s.tax1 <- data.frame(Feature.ID=row.names(aus18s.tax),Taxon=aus18s.tax[,1])
aus18s.tax2 <- parse_taxonomy(aus18s.tax1)

#### Bring in taxonomy 
row.names(aus18st) == row.names(aus18s.tax2)

##### Order level with top 8
aus18sto <- data.frame(Order=aus18s.tax2$Order,aus18st)
aus18sto$Order[is.na(aus18sto$Order)] <- "Unknown"
aus18stoa <- aggregate(. ~ aus18sto$Order, aus18sto[,2:59], sum) 
row.names(aus18stoa) <- aus18stoa[,1]
aus18stoa <- aus18stoa[,2:59]

aus18stoao <- as.matrix(aus18stoa[order(rowSums(aus18stoa),decreasing = T),])

aus18stoaop <- aus18stoao[1:7,]
other <- colSums(aus18stoao[8:18,])
aus18stoaopo <- rbind(aus18stoaop,other)

customcol8 <- c("darkred","tomato1","palegoldenrod","cadetblue4","cyan","darkblue","purple","gray")

## Plot for supplementary Figure 4
barplot(aus18stoaopo,col=customcol8,legend.text=T,axes=F,cex.names=.8,las=2,space=0,args.legend = list(x = "topleft", bty = "n", cex=0.6, inset=c(-0.1, -0.1)))

### Top 18s Cephalotus ASVs, with taxonomy #####
hist(colSums(cep18s))
hist(log(colSums(cep18s)))
cep18s.top <- data.frame(sort(colSums(cep18s),decreasing = T))
names(cep18s.top) <- "Sequences"
cep18s.top.tax <- subset(tax.18s2, row.names(tax.18s2) %in% row.names(cep18s.top))
cep18s.top.tax <- cep18s.top.tax[match(rownames(cep18s.top), rownames(cep18s.top.tax)), ]
row.names(cep18s.top.tax) == row.names(cep18s.top)
cep18s.top <- data.frame(cep18s.top,cep18s.top.tax)
# write.csv(cep18s.top,"Cephalotus_18s_ASVs_by_abundance.csv", quote = F, row.names = T)

#### look at top 20 asv abundance taxonomy in 18S Ceph
dim(cep18s.top) #526 ASV
sum(cep18s.top$Sequences) #108160 18S reads post clean up
cep18s.top.20 <- cep18s.top[1:20,]
sum(cep18s.top.20$Sequences)#63833 16S reads in top 20

#top 20 asv reads divided by total reads
(63833/108160)*100 #59.0172

cep18s.top.20 <- tibble::rownames_to_column(cep18s.top.20, "Feature.ID")
cep18s.top.20.tax <- parse_taxonomy(cep18s.top.20)
cep18s.top.20.tax.abund <- cbind(cep18s.top.20.tax, cep18s.top.20)

#look at top 18S groups
ceph.18S.metaz <- subset(cep18s.top.20.tax.abund, Order == "Metazoa (Animalia)")
((sum(ceph.18S.metaz$Sequences)/63833)*100) #41.27332%

ceph.18S.fungi <- subset(cep18s.top.20.tax.abund, Order == "Fungi")
((sum(ceph.18S.fungi$Sequences)/63833)*100) #40.3553%

### Top taxa by site
cep18s.grr <- subset(cep18s, md18s.cep$Site %in% "GRR")
cep18s.grr.top <- data.frame(sort(colSums(cep18s.grr),decreasing = T))
names(cep18s.grr.top) <- "Sequences"
cep18s.grr.top.tax <- subset(tax.18s2, row.names(tax.18s2) %in% row.names(cep18s.grr.top))
cep18s.grr.top.tax <- cep18s.grr.top.tax[match(rownames(cep18s.grr.top), rownames(cep18s.grr.top.tax)), ]
row.names(cep18s.grr.top.tax) == row.names(cep18s.grr.top)
cep18s.grr.top <- data.frame(cep18s.grr.top,cep18s.grr.top.tax)

cep18s.tpb <- subset(cep18s, md18s.cep$Site %in% "TPB")
cep18s.tpb.top <- data.frame(sort(colSums(cep18s.tpb),decreasing = T))
names(cep18s.tpb.top) <- "Sequences"
cep18s.tpb.top.tax <- subset(tax.18s2, row.names(tax.18s2) %in% row.names(cep18s.tpb.top))
cep18s.tpb.top.tax <- cep18s.tpb.top.tax[match(rownames(cep18s.tpb.top), rownames(cep18s.tpb.top.tax)), ]
row.names(cep18s.tpb.top.tax) == row.names(cep18s.tpb.top)
cep18s.tpb.top <- data.frame(cep18s.tpb.top,cep18s.tpb.top.tax)

### Top eukaryotic taxa are fungi and metazoa, different top ASV at each site

#### All 18s Comparison ####
summary(rowSums(asv18s.t))
asv18s.tr <- rrarefy(asv18s.t,1073)
asv18s.tr <- asv18s.tr[,colSums(asv18s.tr) > 0]

rownames(asv18s.tr) == rownames(md18s) ## double check

### Make new metadata that separates out Australian Nepenthes
md18s.mir <- subset(md18s, md18s$Project == "Mirabilis")
levels(md18s.mir$Type_2) <- c("Bog","Cephalotus","Leaf","Aus_Nepenthes","Sarracenia","Soil","Tube")
md18s2 <- md18s
md18s2$Type_2 <- as.character(md18s2$Type_2)
md18s.mir$Type_2 <- as.character(md18s.mir$Type_2)
md18s2[257:274,] <- md18s.mir
md18s2$Type_2 <- as.factor(md18s2$Type_2)

#### All 18s alpha diversity comparison ####
asv18s.shannon <- diversity(asv18s.tr)
asv18s.ef <- exp(asv18s.shannon)
summary(asv18s.ef)
md18s2$effective_species <- asv18s.ef

md18s2$Type_2 <- factor(md18s2$Type_2, levels = c("Cephalotus","Aus_Nepenthes", "Nepenthes", "Sarracenia", "Tube", "Leaf", "Bog", "Soil"))

ggplot(data=md18s2, aes(x=Type_2, y=effective_species, col=Type_2)) +
  geom_violin(trim = T, draw_quantiles = 0.5) +
  ylab("Effective species of eukaryotes") +
  geom_dotplot(binaxis = "y", stackdir = "center", binwidth = 1, dotsize = .3) +
  theme_classic() + 
  scale_colour_manual(values = c("#0b9e03","#EE0000","#EE0000","#0080FF","#663300","#663300","#663300","#663300")) +
  theme(legend.position="none")

asv18s.kw <- kruskal.test(effective_species~Type_2, data = md18s2)
asv18s.kw

asv18s.pw <- pairwise.wilcox.test(md18s2$effective_species, md18s2$Type_2, p.adjust.method = 'BH')
asv18s.pw

#### All 18s iToL Preparation ####
##### Remove ASVs from each genus that are in less than 10% of the samples from each genus for iToL
asv18s.t100 <- asv18s.t[,colSums(asv18s.t) > 99]
summary(rowSums(asv18s.t100))
summary(colSums(asv18s.t100))
sort(rowSums(asv18s.t100))
asv18s.t100r <- rrarefy(asv18s.t100, 900)
asv18s.t100r <- asv18s.t100r[,colSums(asv18s.t100r) > 0]
summary(colSums(asv18s.t100r))

summary(md18s$Type_2)

Nep18 <- subset(asv18s.t100r, md18s$Type_2=="Nepenthes")
Sar18 <- subset(asv18s.t100r, md18s$Type_2=="Sarracenia")
Cep18 <- subset(asv18s.t100r, md18s$Type_2=="Cephalotus")

Cep18.t <- t(Cep18)
non0 <- apply(Cep18.t, 1, function(x) sum(x != 0))
Cep18.t[which(non0 <=4),] <- 0
Cep18.t.normln <- data.frame(log(rowSums(Cep18.t)/40 + 1))

Nep18.t <- t(Nep18)
non0 <- apply(Nep18.t, 1, function(x) sum(x != 0))
Nep18.t[which(non0 <=22),] <- 0
Nep18.t.normln <- data.frame(log(rowSums(Nep18.t)/218 + 1))

Sar18.t <- t(Sar18)
non0 <- apply(Sar18.t, 1, function(x) sum(x != 0))
Sar18.t[which(non0 <=14),] <- 0
Sar18.t.normln <- data.frame(log(rowSums(Sar18.t)/143 + 1))

BLS18 <- subset(asv18s.t100r, md18s$Type_2 %in% c("Bog","Leaf","Soil"))
BLS18.t <- t(BLS18)
BLS18.t.normln <- data.frame(log(rowSums(BLS18.t)/61 + 1))

iToL18.table <- cbind(Cep18.t.normln,Nep18.t.normln,Sar18.t.normln,BLS18.t.normln)
iToL18.table <- iToL18.table[rowSums(iToL18.table) > 0,]
names(iToL18.table) <- c("Cephalotus","Nepenthes","Sarracenia","BogLeafSoil")

# write.csv(iToL18.table, "Cephalotus_comparison_iToL_18s_2-3_ASV100r.csv", quote = F, row.names = T)

#### Need to subset tree to the same ASVs
asv18s.tree$tip.label
iToL18.matrix <- as.matrix(t(iToL18.table))
asv18s.tree.iToL <- prune.sample(iToL18.matrix, asv18s.tree)
# write.tree(asv18s.tree.iToL,"sepp-tree-18s-iToL_2-3_ASV100r.nwk")

### Subset taxonomy too
iTol.18s.tax <- subset(tax.18s2, row.names(tax.18s2) %in% colnames(iToL18.matrix))
# write.csv(iTol.18s.tax, "Taxonomy_18s_iToL_2-3_ASV100r.csv", quote = F, row.names = T)


